1. Clearing the workspace

rm(list=ls())

2. Loading the Required Packages

library(tidyverse)
library(plotly)
library(vcfR)
library(adegenet)
library(hierfstat)
library(pegas)
library(poppr)
library(boot)

3. Loading the Data

fst_windowed = read.table(file = "/Users/vicegill/Documents/Project_2022/RUS_EUR.windowed.weir.fst", sep = "\t",header = TRUE)

4. Plotting the Fst according to the position on the chromosome

fst_windowed$index=1:nrow(fst_windowed)
threshold_for_fst<- quantile(fst_windowed$WEIGHTED_FST, c(0.975,0.995,0.9999),na.rm=TRUE)
fst_windowed <- fst_windowed %>% mutate(outlier = ifelse(fst_windowed$WEIGHTED_FST > threshold_for_fst[2], "outlier", "background"))
tallied_data <- fst_windowed %>% group_by(outlier) %>% tally()
weak <- fst_windowed[fst_windowed$WEIGHTED_FST > threshold_for_fst[1],]
medium <- fst_windowed[fst_windowed$WEIGHTED_FST > threshold_for_fst[2],]
high <- fst_windowed[fst_windowed$WEIGHTED_FST > threshold_for_fst[3],]

5. Filtering Chromosome 5

As it has the highest Fst Value and the table of very High Fst Differentiated SNP’s

fst_windowed_5 = fst_windowed %>% filter(CHROM==5)
fst_windowed_5$index=1:nrow(fst_windowed_5)
fst_plot = fst_windowed_5 %>%    ggplot(aes(x=index,y=WEIGHTED_FST,color=outlier)) +
           geom_point(alpha=3/4) +
           xlab("Index Position") + 
           ylab("Weighted Fst Value") +
           ggtitle(label = "Fst Value in the Chromosome 5",subtitle = "Project 2023")
fst_plot <- ggplotly(fst_plot)
fst_pot <- layout(fst_plot, height = 200, width = 300) 
fst_plot
high_5 = high <- fst_windowed_5[fst_windowed_5$WEIGHTED_FST > threshold_for_fst[3],]
print(high_5)
##     CHROM BIN_START  BIN_END N_VARIANTS WEIGHTED_FST MEAN_FST index outlier
## 726     5  36750001 36800000          2     0.444164 0.440951   726 outlier
## 734     5  37150001 37200000          2     0.320195 0.318490   734 outlier

6. Zooming in the Region of Maximum Differentiation

Filtering Chromosome 5 as it has the highest Fst Value.High Differention index is 726 and 754 so we can select +/- 50 sites within this interval (676-804)

fst_windowed_5 = fst_windowed %>% filter(CHROM==5)
fst_windowed_5$index=1:nrow(fst_windowed_5)

7. Confidence interval by bootstraping

First Graph include the Quantile Line obtained from the quantile func showing the fst value which is higher and lower of 95 per of SNP’s

fst_value <- fst_windowed$WEIGHTED_FST
quantile_fst = quantile(fst_value,c(0.975,0.025),na.rm = TRUE)
plot(density(fst_value),main="Density Plot of the Fst Values")

abline(v= 0.03573623)
abline(v= -0.02125245)

calculate_fst <- function(data, indices) {
  # Extract the Fst values using the indices
  sampled_fst <- data[indices]
  
  # Calculate the desired statistic, such as mean or median
  result <- median(sampled_fst)  # Adjust as needed
  
  return(result)
} 
# Set the number of bootstrap samples
n_boot <- 10000  # Adjust as desired

# Run the bootstrap procedure
boot_result <- boot(fst_value, calculate_fst, R = n_boot)

# Calculate the confidence interval
ci <- boot.ci(boot_result, type = "basic")  # Adjust the type as desired

# Extract the confidence interval bounds
ci_lower <- ci$basic[4]
ci_upper <- ci$basic[5]

# Print the confidence interval
cat("Confidence Interval: [", ci_lower, ", ", ci_upper, "]\n")
## Confidence Interval: [ 0.00167693 ,  0.00198754 ]

8. Plotting fst values into the region +/- 50 of Highly Differentiated SNP’s on chromosome 5

fst_plot = fst_windowed_5 %>% filter(index>=676 &index<= 804) %>%   
           ggplot(aes(x=index,y=WEIGHTED_FST,color=outlier,shape=(WEIGHTED_FST > ci_upper))) +
           geom_point() +
           xlab("Index Position") +
           ylab("Weighted Fst Value") +
           ggtitle(label = "Fst Value in the Chromosome 5",subtitle = "Project 2023") +
           scale_shape_manual(name="Shape Legend",values = c(1,3),labels=c("Background by BS","Outlier by BS")) + scale_color_manual(name="Color Legend", values = c("#E76F51","#2A9D8F"),label=c("background by Q","Outlier by Q"))+ geom_vline(xintercept = 716, color = "black", linetype = "dashed", size =0.5) +
  geom_vline(xintercept = 748, color = "black", linetype = "dashed", size =0.5) 
fst_plot

print(paste("Lines Showing the Selected Region"))
## [1] "Lines Showing the Selected Region"

9. Finding the chromosome range where we should do the local PCA.

fst_windowed_5 = fst_windowed_5 %>% mutate(mean_pos=(BIN_START+BIN_END)/2)
chrom_pos_lower=as.integer(fst_windowed_5[fst_windowed_5$index==716,]$mean_pos)
chrom_pos_upper=as.integer(fst_windowed_5[fst_windowed_5$index==748,]$mean_pos)
total_snps = fst_windowed_5 %>% filter(index >= 716 & index <= 748)
total_snps=sum(total_snps$N_VARIANTS)
print(paste("Chromosome Position Range ", chrom_pos_lower , "- ", chrom_pos_upper))
## [1] "Chromosome Position Range  35925000 -  37875000"
print(paste("Total number of variants within this range is ",total_snps))
## [1] "Total number of variants within this range is  84"

10. Bash Code to extract the relevant snp from vcf file

#!/bin/bash


cd /dss/dsshome1/09/ra78pec/Local_PCA

 vcftools --vcf /dss/dsshome1/09/ra78pec/data/subset_filtered_prunned.vcf \
          --chr 5 \
          --from-bp 35925000 \
          --to-bp 37875000 \
          --recode \
          --out /dss/dsshome1/09/ra78pec/data/chrom5